In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
from ipywidgets import interact
from mpl_toolkits.basemap import Basemap
import datetime
from scipy.stats import linregress

import plotly.express as px
import plotly
import plotly.graph_objects as go
import datetime
import warnings
warnings.filterwarnings("ignore")
In [3]:
df = pd.read_csv("city_temperature.csv")
df
Out[3]:
Region Country State City Month Day Year AvgTemperature
0 Africa Algeria NaN Algiers 1 1 1995 64.2
1 Africa Algeria NaN Algiers 1 2 1995 49.4
2 Africa Algeria NaN Algiers 1 3 1995 48.8
3 Africa Algeria NaN Algiers 1 4 1995 46.4
4 Africa Algeria NaN Algiers 1 5 1995 47.9
... ... ... ... ... ... ... ... ...
2906322 North America US Additional Territories San Juan Puerto Rico 7 27 2013 82.4
2906323 North America US Additional Territories San Juan Puerto Rico 7 28 2013 81.6
2906324 North America US Additional Territories San Juan Puerto Rico 7 29 2013 84.2
2906325 North America US Additional Territories San Juan Puerto Rico 7 30 2013 83.8
2906326 North America US Additional Territories San Juan Puerto Rico 7 31 2013 83.6

2906327 rows × 8 columns

In [4]:
df['Country'].value_counts()
Out[4]:
Country
US                   1455337
Canada                 74245
Australia              46330
China                  46329
India                  37063
                      ...   
Guyana                  5065
Israel                  4641
Burundi                 4543
Georgia                 4378
Serbia-Montenegro       3427
Name: count, Length: 125, dtype: int64
In [5]:
df.duplicated().sum()
Out[5]:
20715
In [6]:
df = df.drop_duplicates()
In [7]:
df.isnull().sum()
Out[7]:
Region                  0
Country                 0
State             1448805
City                    0
Month                   0
Day                     0
Year                    0
AvgTemperature          0
dtype: int64
State column has many null values. That is okay in the scope of the current analysis¶
In [8]:
plt.figure(figsize = (8,10))
sns.boxplot(data = df, y = 'AvgTemperature')
plt.title('Boxplot of Average Temperature')
Out[8]:
Text(0.5, 1.0, 'Boxplot of Average Temperature')
No description has been provided for this image
Outliers are present and can be seen in the figure above. Let's choose all the rows having 'AvgTemperature' greater than -70°F¶
In [9]:
df = df[df['AvgTemperature']>-70]
In [10]:
df.sort_values(['Year','Month','Day'])
Out[10]:
Region Country State City Month Day Year AvgTemperature
0 Africa Algeria NaN Algiers 1 1 1995 64.2
13809 Africa Benin NaN Cotonou 1 1 1995 81.2
23075 Africa Central African Republic NaN Bangui 1 1 1995 75.3
32341 Africa Congo NaN Brazzaville 1 1 1995 79.9
41606 Africa Egypt NaN Cairo 1 1 1995 59.2
... ... ... ... ... ... ... ... ...
2862479 North America US Wisconsin Green Bay 5 13 2020 38.5
2871744 North America US Wisconsin Madison 5 13 2020 45.7
2881009 North America US Wisconsin Milwaukee 5 13 2020 41.2
2890274 North America US Wyoming Casper 5 13 2020 54.1
2899539 North America US Wyoming Cheyenne 5 13 2020 48.5

2806369 rows × 8 columns

¶

The date spans from Jan 1995 to May 2020 Since 2020 is incomplete, let's strip it off for the current analysis

In [11]:
df = df[df['Year']<2020]

Avg. Temp of the world over the years 1995 to 2019¶

In [12]:
data = df[['Year','AvgTemperature']].groupby('Year').mean()
linfit = np.polyfit(data.index,data['AvgTemperature'],deg=1)
linfit = linfit[0]*data.index + linfit[1]

np.random.seed(0)
dates = pd.date_range(start='1995-01-01', end='2019-12-31', freq='M')
data = pd.DataFrame({'Date': dates, 'Temperature': np.random.normal(15, 5, len(dates))})

# Compute linear fit
slope, intercept, _, _, _ = linregress(data.index, data['Temperature'])
linfit = slope * data.index + intercept

# Plotting using Seaborn and Matplotlib
plt.figure(figsize=(10, 6))
sns.lineplot(data=data, x='Date', y='Temperature', label='Average Temperature')
plt.plot(data['Date'], linfit, label='Linear Fit', linestyle='--')
plt.title('Average Temperature of the World from 1995 to 2019')
plt.xlabel('Date')
plt.ylabel('Temperature')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [13]:
data = df[['Region','AvgTemperature','Year']].groupby(['Region','Year']).mean()
data = data.reset_index(level = 0)

sns.set_style('darkgrid')
plt.figure(figsize=(12,8))
sns.lineplot(data = data, hue ='Region', x ='Year',y = 'AvgTemperature')
plt.title('Average temperature across different regions')
plt.tight_layout()
No description has been provided for this image

Plot - Yearly average temperature graphs for different regions¶

In [14]:
data =df [['Region','Month','AvgTemperature']]
data = data.groupby(['Region','Month']).mean()

months = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
data = data.reset_index(level = 1)

data['Month'] = data.loc[:,'Month'].map(months)
data.head(3)
Out[14]:
Month AvgTemperature
Region
Africa Jan 72.540331
Africa Feb 73.689526
Africa Mar 74.704641
In [15]:
regions = df['Region'].unique()
plot_holder = []
colors = ['green','red','firebrick','blue','teal','black','orange']
for i,region in enumerate(regions):
    plot = go.Bar(x=data.loc[region,'Month'],y=data.loc[region,'AvgTemperature'],name=region,visible = (i==0),marker_color=colors[i])
    plot_holder.append(plot)
fig = go.Figure(data = plot_holder)

buttons = [dict(label=region,method='update',args=[{'title':region,'visible':[True if j==i else False for j in range(len(regions))]}]) for i,region in enumerate(regions)]
updatemenus = list([
    dict(
    buttons=buttons,
    yanchor='top',
    xanchor='left')
])
fig.update_layout(updatemenus=updatemenus)

Africa and central america has nearly the same average temperature throught the year, Australia shows a reverse trend than others.¶

Let's display the highest and lowest recorded temperature in this dataset (1995 to 2020)¶

In [62]:
data1 = df.sort_values(by=['AvgTemperature'],ascending=False).head(1)
data2 = df.sort_values(by=['AvgTemperature'],ascending=True).head(1)
data = pd.concat([data1,data2])
data.index = ['Highest','Lowest']
data
Out[62]:
Region Country State City Month Day Year AvgTemperature
Highest Middle East Kuwait NaN Kuwait 8 1 2012 110.0
Lowest North America US Alaska Fairbanks 12 31 1999 -50.0
In [103]:
map = Basemap(llcrnrlon=-180, llcrnrlat=-90, urcrnrlon=180, urcrnrlat=90)

# Create a figure and axes
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the map boundaries, coastlines, and countries
map.drawmapboundary(fill_color='white')
map.drawcoastlines()
map.drawcountries()

# Define the coordinates, text, size, and color for scatter points
lat = [29.3117, 64.8378]
lon = [29.3117, 147.7164]
text = ['Kuwait', 'Fairbanks']
size = [100, 100]  # Increase the size for better visibility
color = [110, -50]

# Convert coordinates to map projection
x, y = map(lon, lat)

# Scatter plot with size and color
scatter = ax.scatter(x, y, s=size, c=color, cmap='coolwarm', alpha=0.75, edgecolors='k')

# Add text labels to the scatter points
for i, txt in enumerate(text):
    ax.annotate(txt, (x[i], y[i]), fontsize=10, ha='center', va='bottom')

# Add a colorbar for the color scale
cbar = plt.colorbar(scatter, ax=ax, orientation='vertical')
cbar.set_label('Color Scale')

# Set plot title
plt.title('Geographical Scatter Plot')

# Show the plot
plt.show()
No description has been provided for this image
Kuwait on 1st August 2012 recorded the highest temperature of 110°F and Fairbanks on 31st December 1999 recorded the lowest.¶
In [39]:
data = df[['Country','AvgTemperature']].groupby('Country').mean().sort_values('AvgTemperature')
print('The top five coldest countries in the world are: ',data.index[:5].to_list())
print('The top five hottest countries in the world are: ',data.index[-5:].to_list())
The top five coldest countries in the world are:  ['Mongolia', 'Iceland', 'Norway', 'Canada', 'Finland']
The top five hottest countries in the world are:  ['Guyana', 'Indonesia', 'Thailand', 'Nigeria', 'Haiti']
Top 5 coldest countries over the years¶
In [16]:
data = df[['Country','Year','AvgTemperature']].groupby(['Year','Country']).mean().sort_values('AvgTemperature').reset_index()
data = data.groupby('Year').apply(lambda group: group.iloc[:5]).reset_index(drop=True)
px.choropleth(data_frame=data,locationmode='country names',locations='Country',color='AvgTemperature',animation_frame='Year',title="Top five coldest countries over the years")
Top hottest 5 countries over the years¶
In [17]:
data = df[['Country','Year','AvgTemperature']].groupby(['Year','Country']).mean().sort_values('AvgTemperature',ascending=False).reset_index()
data = data.groupby('Year').apply(lambda group: group.iloc[:5]).reset_index(drop=True)
px.choropleth(data_frame=data,locationmode='country names',locations='Country',color='AvgTemperature',animation_frame='Year',title="Top five hottest countries over the years")

Choropleth plot of countrywise average temperature over the years 1995 to 2019¶

In [18]:
data = df[['Country','Year','AvgTemperature']].groupby(['Country','Year']).mean().reset_index()
px.choropleth(data_frame=data,locations="Country",locationmode='country names',animation_frame="Year",color='AvgTemperature',color_continuous_scale = 'Turbo',title="Choropleth plot of countrywise average temperature over the years 1995 to 2019")
Let's narrow the analysis to Mumbai city in India¶

Selecting data points of mumbai city

In [19]:
data = df[df['Country'] == 'India']
bom = data.loc[data['City'] == 'Bombay (Mumbai)',['Month','Day','Year','AvgTemperature']].reset_index(drop=True)
bom.head(3)
Out[19]:
Month Day Year AvgTemperature
0 1 1 1995 71.8
1 1 2 1995 72.0
2 1 3 1995 70.3
In [20]:
size = bom.groupby(['Year','Month']).size().reset_index()
size_max = size[0].max()
size_min = size[0].min()
n = size_max - size_min +1
cmap = sns.color_palette("deep",n)
size = size.pivot(index='Year',columns='Month',values=0)
size.head(3)
Out[20]:
Month 1 2 3 4 5 6 7 8 9 10 11 12
Year
1995 31 28 30 30 31 30 31 31 30 31 30 31
1996 31 29 31 30 31 30 31 31 30 31 30 31
1997 31 28 31 30 31 30 31 31 30 31 30 31
In [21]:
plt.figure(figsize=(8,8))
sns.heatmap(size,cmap=cmap)
_=plt.title('Number of recorded data points monthly from year 1995 to 2019 in Bombay')
colorbar = plt.gca().collections[0].colorbar

r = colorbar.vmax - colorbar.vmin
colorbar.set_ticks([colorbar.vmin + (r/n*(i+0.5)) for i in range(n)])
colorbar.set_ticklabels(range(size_min,size_max+1))
No description has been provided for this image

¶

February column (Month = 2) shows leap years, but in certain other months, you can see the color mismatches. This is because some data is missing for that particular month

Let's create a datetime column and drop the month, year and day column

In [22]:
bom['Date'] = bom[['Year','Month','Day']].apply(lambda row:'-'.join([str(row['Year']),str(row['Month']),str(row['Day'])]),axis=1)
bom['Date'] = pd.to_datetime(bom['Date'])
bom = bom.drop(columns=['Month','Day','Year']).set_index('Date')
bom.head(3)
Out[22]:
AvgTemperature
Date
1995-01-01 71.8
1995-01-02 72.0
1995-01-03 70.3
Plot of Average Temperature of the city of Mumbai(1995-2019)¶
In [23]:
px.line(data_frame=bom,color_discrete_sequence=['grey'],title="Daily Average Temperature - Bombay (1995-2019)")
In [24]:
roll_mean = bom.rolling(window=31).mean()
roll_mean2 = bom.rolling(window=365).mean()
fig = go.Figure()
fig.add_trace(go.Scatter(x=bom.index,y=bom['AvgTemperature'],marker=dict(color='grey'),name='Daily'))
fig.add_trace(go.Scatter(x=roll_mean.index,y=roll_mean['AvgTemperature'],marker=dict(color='red'),name='31DaysRolling'))
fig.add_trace(go.Scatter(x=roll_mean2.index,y=roll_mean2['AvgTemperature'],marker=dict(color='green'),name='365DaysRolling'))
fig.update_layout(dict(title='Rolling Mean'))
In [25]:
roll_mean = bom.rolling(window=31).std()
roll_mean2 = bom.rolling(window=365).std()
fig = go.Figure()
fig.add_trace(go.Scatter(x=roll_mean.index,y=roll_mean['AvgTemperature'],marker=dict(color='red'),name='31DaysRolling'))
fig.add_trace(go.Scatter(x=roll_mean2.index,y=roll_mean2['AvgTemperature'],marker=dict(color='green'),name='365DaysRolling'))
fig.update_layout(dict(title='Rolling Std'))

Seasonal Decompose¶

In [26]:
from statsmodels.tsa.seasonal import seasonal_decompose
In [27]:
decompose = seasonal_decompose(bom,period=365)
decompose.plot();
No description has been provided for this image

Dickey Fuller Test¶

Let's perform a Dickey Fuller test to verify the stationarity

Null hypothesis: The time series data is non stationary

In [28]:
from statsmodels.tsa.stattools import adfuller
In [29]:
adf = adfuller(x=bom['AvgTemperature'])
print('pvalue:',adf[1])
print('adf:',adf[0])
print('usedlag:',adf[2])
print('nobs:',adf[3])
print('critical_values:',adf[4])
print('icbest:',adf[5])
pvalue: 8.043434022164906e-16
adf: -9.356278759389957
usedlag: 36
nobs: 9065
critical_values: {'1%': -3.4310715833098833, '5%': -2.8618588932772657, '10%': -2.5669397418503186}
icbest: 33873.287352081155
The p-value is very small, close to 0. Hence we can reject our null hypothesis. The data is stationary¶

LSTM Model Creation¶

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
WARNING:tensorflow:From C:\Users\rbaud\anaconda3\Lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

Train - Test - Split: Choosing 2019 data for testing and remaining for training¶
In [31]:
test = bom[bom.index>'2019']
train = bom[bom.index<'2019']
In [32]:
scaler = MinMaxScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)
In [33]:
time_steps = 20
features = 1

train_gen = TimeseriesGenerator(train,train,time_steps,batch_size=32)
test_gen = TimeseriesGenerator(test,test,time_steps,batch_size=32)
In [34]:
model = Sequential()
model.add(LSTM(64,activation='relu',input_shape=(time_steps,features),return_sequences=True))
model.add(LSTM(32,activation='relu'))
model.add(Dense(1,activation='relu'))
model.compile(optimizer='adam',loss='mse')

model.summary()
WARNING:tensorflow:From C:\Users\rbaud\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

WARNING:tensorflow:From C:\Users\rbaud\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm (LSTM)                 (None, 20, 64)            16896     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
=================================================================
Total params: 29345 (114.63 KB)
Trainable params: 29345 (114.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Defining Callbacks¶
In [35]:
early_stop = EarlyStopping(patience=5)
Fitting the model¶
In [36]:
model.fit(x=train_gen,epochs=40,callbacks=[early_stop],validation_data=test_gen)

#Save the model
model.save('LSTMAvgTemp.h5')
Epoch 1/40
WARNING:tensorflow:From C:\Users\rbaud\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.

273/273 [==============================] - 5s 11ms/step - loss: 0.4278 - val_loss: 0.4695
Epoch 2/40
273/273 [==============================] - 2s 9ms/step - loss: 0.4278 - val_loss: 0.4695
Epoch 3/40
273/273 [==============================] - 2s 9ms/step - loss: 0.4278 - val_loss: 0.4695
Epoch 4/40
273/273 [==============================] - 2s 9ms/step - loss: 0.4278 - val_loss: 0.4695
Epoch 5/40
273/273 [==============================] - 2s 9ms/step - loss: 0.4278 - val_loss: 0.4695
Epoch 6/40
273/273 [==============================] - 2s 9ms/step - loss: 0.4278 - val_loss: 0.4695
In [37]:
predict = model.predict(test_gen)
test_targets = test_gen.targets[test_gen.start_index:test_gen.end_index+1]
11/11 [==============================] - 0s 4ms/step
In [38]:
predict = scaler.inverse_transform(predict).ravel()
test_targets = scaler.inverse_transform(test_targets).ravel()

Model Evaluation¶

The predictions of the model for the year 2019 are plotted with the actual target values of 2019

In [39]:
from sklearn.metrics import mean_squared_error
In [40]:
_,ax=plt.subplots(1,1,figsize=(12,8))
sns.lineplot(x=range(len(predict)),y=predict,ax=ax,label='Predictions')
sns.lineplot(x=range(len(test_targets)),y=test_targets,ax=ax,label='Actual')
plt.legend()
_=plt.title('Comparison - Predictions vs Actual Target Temperatures - Bombay (2019)')
No description has been provided for this image
In [41]:
print('The RMSE Score is:',format(np.sqrt(mean_squared_error(predict,test_targets)),'.2f'))
The RMSE Score is: 20.01

Forecasting beyond 2019¶

Select the last 'time_steps' values of average temperature from test data. It can be used to forecast further.

In [42]:
data = bom.iloc[-time_steps:].to_numpy() #2D Array
data = scaler.transform(data)

#expand to include batch dimension
data = np.expand_dims(data,0)

#record the last date of observartion from the data
date = bom.index[-1]

date_store = bom.iloc[-time_steps:].index.to_list()

#forecasting
forecasts=10
for i in range(forecasts):
    predicted = model.predict(data[:,-20:,:])
    date = date+datetime.timedelta(days=1)
    data = np.append(data,[predicted],axis=1)
    date_store.append(date)
data = scaler.inverse_transform(data.reshape(1,-1))
forecast_df = pd.DataFrame(index=date_store[time_steps-1:],data={'AvgTemperature':data.ravel()[time_steps-1:]})
1/1 [==============================] - 0s 259ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 16ms/step
1/1 [==============================] - 0s 31ms/step
Let's print and plot the forecast_df¶

The first datapoint in forecast_df is not predicted. It is borrowed from the given data to ensure continuity while plotting

In [43]:
forecast_df
Out[43]:
AvgTemperature
2019-12-31 74.3
2020-01-01 63.4
2020-01-02 63.4
2020-01-03 63.4
2020-01-04 63.4
2020-01-05 63.4
2020-01-06 63.4
2020-01-07 63.4
2020-01-08 63.4
2020-01-09 63.4
2020-01-10 63.4
In [44]:
_,ax=plt.subplots(1,1,figsize=(12,8))
sns.lineplot(data=bom.iloc[-100:,:],y='AvgTemperature',x=bom.iloc[-100:,:].index,color='blue',ax=ax,label='AvgTemp - 2019 end')
sns.lineplot(data=forecast_df,y='AvgTemperature',x=forecast_df.index,color='red',ax=ax,label= 'AvgTemp - 2020 forecasted')
_=plt.title(f'Temperature Forecasting - {forecasts} days (2020)')
No description has been provided for this image

Seasonal ARIMA model¶

In [45]:
!pip3 install -q pmdarima
from pmdarima.arima import auto_arima
from dateutil.relativedelta import relativedelta
The data should be downsampled to train the Seasonal ARIMA model faster. So here we are downsampling the data from daily to monthly frequency. If downsampling is not performed, the model created will be very huge in this case¶
In [46]:
bom_monthly = bom.resample('M').mean()
bom_monthly.head(5)
Out[46]:
AvgTemperature
Date
1995-01-31 72.393548
1995-02-28 76.214286
1995-03-31 79.393333
1995-04-30 84.120000
1995-05-31 86.819355
In [47]:
data =[go.Scatter(x=bom.index,y=bom['AvgTemperature'],name='AvTemperature-Daily',marker=dict(color='grey')),go.Scatter(x= bom_monthly.index,y=bom_monthly['AvgTemperature'],name='AvgTemperature-Monthly',marker=dict(color='red'))]
fig = go.Figure(data)

buttons = [dict(label='Both',method='restyle',args=[{'visible':[True,True]}]),
           dict(label='Daily',method='restyle',args=[{'visible':[True,False]}]),
           dict(label='Monthly',method='restyle',args=[{'visible':[False,True]}])]

updatemenus=[dict(type="buttons",direction='down',buttons=buttons)]
fig.update_layout(updatemenus=updatemenus,title='Average Temperature - Daily and Monthly - Bombay')
The auto arima function can automatically determine arima parameters and model¶

The data is seasonal with the frequency of 12 months

In [48]:
#using default auto_arima arguments
model = auto_arima(bom_monthly,seasonal=True,m=12)
Storing date and temperature of the last row so that forecast plot can be made continuous¶
In [49]:
date = bom_monthly.index[-1]
last_val = bom_monthly.iloc[-1].to_numpy()
Forecasting for the first 5 months in 2020¶
In [50]:
forecasts=5
date_store = [(date + relativedelta(months=i)) for i in range(0,forecasts+1)]

predict = model.predict(forecasts)
predict =  np.append(last_val,predict)
forecast_df = pd.DataFrame(data=predict,index=date_store,columns=['AvgTemperature'])
forecast_df.head(5)
Out[50]:
AvgTemperature
2019-12-31 81.635484
2020-01-31 79.518710
2020-02-29 80.583247
2020-03-31 83.056672
2020-04-30 85.841799
Plotting results¶
In [51]:
_,ax=plt.subplots(1,1,figsize=(10,10))
sns.lineplot(data=forecast_df['AvgTemperature'],ax=ax,color='red')
sns.lineplot(data=bom_monthly['AvgTemperature'][-4*12:],ax=ax,color='blue')
_=plt.title(f'Forecast - Monthly Temperature Average (2020) - First {forecasts} months')
No description has been provided for this image
In [ ]: